{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "pip install openai==0.28"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "peynC3Ni_xn_",
        "outputId": "266ebff9-8f2c-4f85-d54e-3fd250e2c1b7"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting openai==0.28\n",
            "  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)\n",
            "Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.11/dist-packages (from openai==0.28) (2.32.3)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from openai==0.28) (4.67.1)\n",
            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from openai==0.28) (3.11.15)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.20->openai==0.28) (3.4.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.20->openai==0.28) (3.10)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.20->openai==0.28) (2.4.0)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.20->openai==0.28) (2025.4.26)\n",
            "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (2.6.1)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (1.3.2)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (25.3.0)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (1.6.0)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (6.4.4)\n",
            "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (0.3.1)\n",
            "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->openai==0.28) (1.20.0)\n",
            "Downloading openai-0.28.0-py3-none-any.whl (76 kB)\n",
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/76.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.5/76.5 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: openai\n",
            "  Attempting uninstall: openai\n",
            "    Found existing installation: openai 1.81.0\n",
            "    Uninstalling openai-1.81.0:\n",
            "      Successfully uninstalled openai-1.81.0\n",
            "Successfully installed openai-0.28.0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import time\n",
        "import random\n",
        "import openai\n",
        "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
        "from sklearn.utils import resample\n",
        "from openai.error import RateLimitError"
      ],
      "metadata": {
        "id": "hObXktiB-r6e"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Connecting to Google Drive"
      ],
      "metadata": {
        "id": "psTaLDP82HXd"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "\n",
        "# Mount your Google Drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pVUyBxYC2ROt",
        "outputId": "287b4631-6551-4468-de0c-09a043d2b5a3"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading the PAT2TM concordance (with positive and negative Zij values)"
      ],
      "metadata": {
        "id": "Fuka7Axf2XXU"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "z_values = pd.read_csv('/content/drive/My Drive/colab/p2t_validation/data/P2T_withNegativeZij.csv')\n",
        "z_values.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "7u0AdQYQ4th7",
        "outputId": "58092e60-1611-4573-d0c5-e9a26a63b31c"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   Unnamed: 0 CPC_4digit Nice_subclass       Zij  \\\n",
              "0           1       B41F    1_1/13_0.1 -1.831201   \n",
              "1           2       B21L    1_1/13_0.1 -0.250157   \n",
              "2           3       B41L    1_1/13_0.1 -0.334290   \n",
              "3           4       C08L    1_1/13_0.1  2.504943   \n",
              "4           5       B41G    1_1/13_0.1 -0.149496   \n",
              "\n",
              "                                    CPC_4digit_label  \\\n",
              "0                       Printing Machines or Presses   \n",
              "1                                Making Metal Chains   \n",
              "2  Apparatus or Devices for Manifolding, Duplicat...   \n",
              "3           Compositions of Macromolecular Compounds   \n",
              "4  Apparatus for Bronze Printing, Line Printing, ...   \n",
              "\n",
              "                               Nice_subclass_keyword  \n",
              "0  Chemical | Composite | Water | Compound | Manu...  \n",
              "1  Chemical | Composite | Water | Compound | Manu...  \n",
              "2  Chemical | Composite | Water | Compound | Manu...  \n",
              "3  Chemical | Composite | Water | Compound | Manu...  \n",
              "4  Chemical | Composite | Water | Compound | Manu...  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-11d19e8f-f4b8-450e-a81b-a6d9ebe23838\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Unnamed: 0</th>\n",
              "      <th>CPC_4digit</th>\n",
              "      <th>Nice_subclass</th>\n",
              "      <th>Zij</th>\n",
              "      <th>CPC_4digit_label</th>\n",
              "      <th>Nice_subclass_keyword</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>B41F</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>-1.831201</td>\n",
              "      <td>Printing Machines or Presses</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2</td>\n",
              "      <td>B21L</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>-0.250157</td>\n",
              "      <td>Making Metal Chains</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3</td>\n",
              "      <td>B41L</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>-0.334290</td>\n",
              "      <td>Apparatus or Devices for Manifolding, Duplicat...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>4</td>\n",
              "      <td>C08L</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>2.504943</td>\n",
              "      <td>Compositions of Macromolecular Compounds</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>5</td>\n",
              "      <td>B41G</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>-0.149496</td>\n",
              "      <td>Apparatus for Bronze Printing, Line Printing, ...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-11d19e8f-f4b8-450e-a81b-a6d9ebe23838')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-11d19e8f-f4b8-450e-a81b-a6d9ebe23838 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-11d19e8f-f4b8-450e-a81b-a6d9ebe23838');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    <div id=\"df-985b83a7-9665-4612-9cd3-51bc1afe73be\">\n",
              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-985b83a7-9665-4612-9cd3-51bc1afe73be')\"\n",
              "                title=\"Suggest charts\"\n",
              "                style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "      </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "      <script>\n",
              "        async function quickchart(key) {\n",
              "          const quickchartButtonEl =\n",
              "            document.querySelector('#' + key + ' button');\n",
              "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "          try {\n",
              "            const charts = await google.colab.kernel.invokeFunction(\n",
              "                'suggestCharts', [key], {});\n",
              "          } catch (error) {\n",
              "            console.error('Error during call to suggestCharts:', error);\n",
              "          }\n",
              "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "        }\n",
              "        (() => {\n",
              "          let quickchartButtonEl =\n",
              "            document.querySelector('#df-985b83a7-9665-4612-9cd3-51bc1afe73be button');\n",
              "          quickchartButtonEl.style.display =\n",
              "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "        })();\n",
              "      </script>\n",
              "    </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "z_values"
            }
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# One can create the PAT2TM concordance based on statistically significant z-scores (i.e., z-scores greater than zero).\n",
        "p2t = z_values[z_values['Zij'] > 0]\n",
        "p2t.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "z2f47mKvG7Ob",
        "outputId": "53d1703f-0538-4580-fe13-87b3b89af074"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "    Unnamed: 0 CPC_4digit Nice_subclass       Zij  \\\n",
              "3            4       C08L    1_1/13_0.1  2.504943   \n",
              "7            8       C23C    1_1/13_0.1  3.069482   \n",
              "9           10       C08K    1_1/13_0.1  3.873043   \n",
              "11          12       C07F    1_1/13_0.1  3.394264   \n",
              "12          13       G03C    1_1/13_0.1  1.053844   \n",
              "\n",
              "                                     CPC_4digit_label  \\\n",
              "3            Compositions of Macromolecular Compounds   \n",
              "7   Coating Metallic Material; Coating Material Wi...   \n",
              "9   Use of inorganic or Non-Macromolecular organic...   \n",
              "11  Acyclic, Carbocyclic or Heterocyclic Compounds...   \n",
              "12  Photosensitive Materials for Photographic Purp...   \n",
              "\n",
              "                                Nice_subclass_keyword  \n",
              "3   Chemical | Composite | Water | Compound | Manu...  \n",
              "7   Chemical | Composite | Water | Compound | Manu...  \n",
              "9   Chemical | Composite | Water | Compound | Manu...  \n",
              "11  Chemical | Composite | Water | Compound | Manu...  \n",
              "12  Chemical | Composite | Water | Compound | Manu...  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-79fb7049-66db-4375-8e83-5c6bf479aade\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Unnamed: 0</th>\n",
              "      <th>CPC_4digit</th>\n",
              "      <th>Nice_subclass</th>\n",
              "      <th>Zij</th>\n",
              "      <th>CPC_4digit_label</th>\n",
              "      <th>Nice_subclass_keyword</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>4</td>\n",
              "      <td>C08L</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>2.504943</td>\n",
              "      <td>Compositions of Macromolecular Compounds</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>8</td>\n",
              "      <td>C23C</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>3.069482</td>\n",
              "      <td>Coating Metallic Material; Coating Material Wi...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>10</td>\n",
              "      <td>C08K</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>3.873043</td>\n",
              "      <td>Use of inorganic or Non-Macromolecular organic...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>12</td>\n",
              "      <td>C07F</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>3.394264</td>\n",
              "      <td>Acyclic, Carbocyclic or Heterocyclic Compounds...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>12</th>\n",
              "      <td>13</td>\n",
              "      <td>G03C</td>\n",
              "      <td>1_1/13_0.1</td>\n",
              "      <td>1.053844</td>\n",
              "      <td>Photosensitive Materials for Photographic Purp...</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-79fb7049-66db-4375-8e83-5c6bf479aade')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-79fb7049-66db-4375-8e83-5c6bf479aade button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-79fb7049-66db-4375-8e83-5c6bf479aade');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    <div id=\"df-c52a10b7-b92f-4726-863d-f71cd4cd85da\">\n",
              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-c52a10b7-b92f-4726-863d-f71cd4cd85da')\"\n",
              "                title=\"Suggest charts\"\n",
              "                style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "      </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "      <script>\n",
              "        async function quickchart(key) {\n",
              "          const quickchartButtonEl =\n",
              "            document.querySelector('#' + key + ' button');\n",
              "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "          try {\n",
              "            const charts = await google.colab.kernel.invokeFunction(\n",
              "                'suggestCharts', [key], {});\n",
              "          } catch (error) {\n",
              "            console.error('Error during call to suggestCharts:', error);\n",
              "          }\n",
              "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "        }\n",
              "        (() => {\n",
              "          let quickchartButtonEl =\n",
              "            document.querySelector('#df-c52a10b7-b92f-4726-863d-f71cd4cd85da button');\n",
              "          quickchartButtonEl.style.display =\n",
              "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "        })();\n",
              "      </script>\n",
              "    </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "p2t",
              "summary": "{\n  \"name\": \"p2t\",\n  \"rows\": 41228,\n  \"fields\": [\n    {\n      \"column\": \"Unnamed: 0\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 122837,\n        \"min\": 4,\n        \"max\": 407785,\n        \"num_unique_values\": 41228,\n        \"samples\": [\n          14401,\n          182641,\n          312729\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"CPC_4digit\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 656,\n        \"samples\": [\n          \"B82B\",\n          \"A01G\",\n          \"F02F\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Nice_subclass\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 611,\n        \"samples\": [\n          \"14_5/19_0.05\",\n          \"24_11/12_0.03\",\n          \"12_6/16_0.04\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 6.991583131106749,\n        \"min\": 1.54767461646319e-05,\n        \"max\": 234.667967105517,\n        \"num_unique_values\": 40342,\n        \"samples\": [\n          2.41060931329764,\n          1.02498013871603,\n          0.882815389014795\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"CPC_4digit_label\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 656,\n        \"samples\": [\n          \"Nanostructures formed by Manipulation of individual Atoms,\\r\\r\\r\\nMolecules, or Limited Collections of Atoms or Molecules As\\r\\r\\r\\nDiscrete Units; Manufacture or Treatment thereof\",\n          \"Horticulture; Cultivation of 
Vegetables, Flowers, Rice, Fruit, Vines, Hops or Seaweed; Forestry; Watering \",\n          \"Cylinders, Pistons or Casings, for Combustion Engines; Arrangements of Sealings in Combustion Engines \"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Nice_subclass_keyword\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 611,\n        \"samples\": [\n          \"Bracelet | Gold | Bead | Necklace | Chain\",\n          \"Cloth | Fabric | Article | Line | Manufacture\",\n          \"Vehicle | Carrier | Rack | Roof | Luggage\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Setting up the OpenAI API key for authentication and access"
      ],
      "metadata": {
        "id": "X76HxBE507XX"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "h0GsyOdz8kYw",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "5bbee420-0751-4594-c5db-38ec84272707"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The capital of France is Paris.\n"
          ]
        }
      ],
      "source": [
        "# Keep the API key out of the notebook: anything typed into a cell is saved with the file.\n",
        "# openai 0.28 reads OPENAI_API_KEY from the environment at import time; prompt only if it is unset.\n",
        "from getpass import getpass\n",
        "\n",
        "if not openai.api_key:\n",
        "    openai.api_key = getpass(\"OpenAI API key: \")\n",
        "\n",
        "# Smoke test: send one short request to confirm that the key and the model name work.\n",
        "response = openai.ChatCompletion.create(\n",
        "    model=\"gpt-4-turbo\",  # Specify the model you want to use\n",
        "    messages=[\n",
        "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
        "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n",
        "    ],\n",
        "    temperature=0  # Setting temperature to 0 for consistent responses\n",
        ")\n",
        "\n",
        "# Print the response from the model\n",
        "print(response['choices'][0]['message']['content'])"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "M5QZ34FH_AQt"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Validation 1\n",
        "We randomly select 1,000 comparisons, each pairing one CPC technology with two Nice subclasses: one whose Zij value for that technology is greater than zero and one whose Zij value is zero or less. Our expectation is that GPT-4 will judge the technology to be more closely related to the Nice subclass with the positive Zij value than to the one with the non-positive Zij value.\n"
      ],
      "metadata": {
        "id": "J_qfI5da_exC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Partition the Zij table by sign: strictly positive rows vs. zero-or-negative rows.\n",
        "z_values_positive = z_values.loc[z_values['Zij'].gt(0)]\n",
        "z_values_negative = z_values.loc[z_values['Zij'].le(0)]"
      ],
      "metadata": {
        "id": "45zBztSH_lEy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Setting the number of iterations\n",
        "N = 1000\n",
        "M = N//2"
      ],
      "metadata": {
        "id": "r1UUoCY48peW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Build M comparisons in which to1 has Zij > 0 and to2 has Zij <= 0 for the same\n",
        "# CPC technology, so p2t is always 1 in this frame.\n",
        "pair_columns = ['descriptor', 'to1', 'to2', 'Zij_to1', 'Zij_to2', 'p2t']\n",
        "\n",
        "# Index the non-positive rows by CPC code once instead of re-filtering on every draw.\n",
        "# observed=True skips unused categories should the column be categorical.\n",
        "negative_by_cpc = {\n",
        "    cpc: rows for cpc, rows in z_values_negative.groupby('CPC_4digit', observed=True)\n",
        "}\n",
        "\n",
        "# Draw only from positive rows whose CPC code also has a non-positive row. The previous\n",
        "# loop discarded draws without a counterpart and therefore returned fewer than M pairs;\n",
        "# sampling from the eligible rows keeps the per-pair distribution and always yields M.\n",
        "has_counterpart = z_values_positive['CPC_4digit'].isin(list(negative_by_cpc))\n",
        "eligible_positive = z_values_positive[has_counterpart]\n",
        "\n",
        "pair_records = []\n",
        "if not eligible_positive.empty:\n",
        "    # replace=True mirrors the independent single-row draws of the previous loop\n",
        "    anchors = eligible_positive.sample(n=M, replace=True)\n",
        "    for _, anchor in anchors.iterrows():\n",
        "        partner = negative_by_cpc[anchor['CPC_4digit']].sample(n=1).iloc[0]\n",
        "        pair_records.append({\n",
        "            'descriptor': anchor['CPC_4digit_label'],\n",
        "            'to1': anchor['Nice_subclass_keyword'],\n",
        "            'to2': partner['Nice_subclass_keyword'],\n",
        "            'Zij_to1': anchor['Zij'],\n",
        "            'Zij_to2': partner['Zij'],\n",
        "            # 1 when the first subclass has the larger Zij, else 0\n",
        "            'p2t': int(anchor['Zij'] > partner['Zij']),\n",
        "        })\n",
        "\n",
        "# Build the frame once; pd.concat inside the loop re-copied the whole frame on every pass.\n",
        "input_df_pn = pd.DataFrame(pair_records, columns=pair_columns)"
      ],
      "metadata": {
        "id": "5LOakkjz39KO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Build M comparisons in which to1 has Zij <= 0 and to2 has Zij > 0 for the same\n",
        "# CPC technology, so p2t is always 0 in this frame.\n",
        "pair_columns = ['descriptor', 'to1', 'to2', 'Zij_to1', 'Zij_to2', 'p2t']\n",
        "\n",
        "# Index the positive rows by CPC code once instead of re-filtering on every draw.\n",
        "# observed=True skips unused categories should the column be categorical.\n",
        "positive_by_cpc = {\n",
        "    cpc: rows for cpc, rows in z_values_positive.groupby('CPC_4digit', observed=True)\n",
        "}\n",
        "\n",
        "# Draw only from non-positive rows whose CPC code also has a positive row. The previous\n",
        "# loop discarded draws without a counterpart and therefore returned fewer than M pairs;\n",
        "# sampling from the eligible rows keeps the per-pair distribution and always yields M.\n",
        "has_counterpart = z_values_negative['CPC_4digit'].isin(list(positive_by_cpc))\n",
        "eligible_negative = z_values_negative[has_counterpart]\n",
        "\n",
        "pair_records = []\n",
        "if not eligible_negative.empty:\n",
        "    # replace=True mirrors the independent single-row draws of the previous loop\n",
        "    anchors = eligible_negative.sample(n=M, replace=True)\n",
        "    for _, anchor in anchors.iterrows():\n",
        "        partner = positive_by_cpc[anchor['CPC_4digit']].sample(n=1).iloc[0]\n",
        "        pair_records.append({\n",
        "            'descriptor': anchor['CPC_4digit_label'],\n",
        "            'to1': anchor['Nice_subclass_keyword'],\n",
        "            'to2': partner['Nice_subclass_keyword'],\n",
        "            'Zij_to1': anchor['Zij'],\n",
        "            'Zij_to2': partner['Zij'],\n",
        "            # 1 when the first subclass has the larger Zij, else 0\n",
        "            'p2t': int(anchor['Zij'] > partner['Zij']),\n",
        "        })\n",
        "\n",
        "# Build the frame once; pd.concat inside the loop re-copied the whole frame on every pass.\n",
        "input_df_np = pd.DataFrame(pair_records, columns=pair_columns)"
      ],
      "metadata": {
        "id": "X7_04zIZ413c"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Stack the two sampled frames and append an all-NaN chat_gpt column\n",
        "# (presumably filled in later with the model's answers).\n",
        "input_df = (\n",
        "    pd.concat([input_df_pn, input_df_np], ignore_index=True)\n",
        "    .assign(chat_gpt=np.nan)\n",
        ")\n",
        "input_df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 258
        },
        "id": "c6-5jrDJ9g9T",
        "outputId": "7ac80ed8-38fc-48fc-b9bd-f174c4538085"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                          descriptor  \\\n",
              "0  Mechanical Treatment of Natural Fibrous or Fil...   \n",
              "1  Waveguides; Resonators, Lines, or Other Device...   \n",
              "2  Indexing Scheme Relating to Buttons, Pins, Buc...   \n",
              "3  Specific Uses or Applications of Nanostructure...   \n",
              "4  Machines, Apparatus or Devices for, or Methods...   \n",
              "\n",
              "                                                 to1  \\\n",
              "0               Spun | Thread | Cotton | Silk | Yarn   \n",
              "1  Rental | Broadcast | Apparatus | Equipment | R...   \n",
              "2   Japanese | Sandal-clogs | Style | Clogs | Wooden   \n",
              "3  Instrument | Medical | Apparatus | Diagnostic ...   \n",
              "4   Arrangement | Tour | Sightseeing | Travel | Book   \n",
              "\n",
              "                                                 to2    Zij_to1   Zij_to2  \\\n",
              "0  Grinders | Lipstick | Mascara | Agent | Antist...  27.573124 -0.062391   \n",
              "1            Beer | Drink | Alcohol | Ale | Beverage   9.382381 -0.301237   \n",
              "2       Car | Motor | Automobile | Railway | Vehicle   2.150680 -0.470676   \n",
              "3          Game | Machine | Apparatus | Play | Board   0.608825 -1.692199   \n",
              "4  Legal | Rights | License | Exploitation | Inte...   0.593654 -1.168645   \n",
              "\n",
              "   p2t  chat_gpt  \n",
              "0    1       NaN  \n",
              "1    1       NaN  \n",
              "2    1       NaN  \n",
              "3    1       NaN  \n",
              "4    1       NaN  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0e97cea9-92c4-4602-b7c9-8126b4c4babe\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>descriptor</th>\n",
              "      <th>to1</th>\n",
              "      <th>to2</th>\n",
              "      <th>Zij_to1</th>\n",
              "      <th>Zij_to2</th>\n",
              "      <th>p2t</th>\n",
              "      <th>chat_gpt</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Mechanical Treatment of Natural Fibrous or Fil...</td>\n",
              "      <td>Spun | Thread | Cotton | Silk | Yarn</td>\n",
              "      <td>Grinders | Lipstick | Mascara | Agent | Antist...</td>\n",
              "      <td>27.573124</td>\n",
              "      <td>-0.062391</td>\n",
              "      <td>1</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Waveguides; Resonators, Lines, or Other Device...</td>\n",
              "      <td>Rental | Broadcast | Apparatus | Equipment | R...</td>\n",
              "      <td>Beer | Drink | Alcohol | Ale | Beverage</td>\n",
              "      <td>9.382381</td>\n",
              "      <td>-0.301237</td>\n",
              "      <td>1</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Indexing Scheme Relating to Buttons, Pins, Buc...</td>\n",
              "      <td>Japanese | Sandal-clogs | Style | Clogs | Wooden</td>\n",
              "      <td>Car | Motor | Automobile | Railway | Vehicle</td>\n",
              "      <td>2.150680</td>\n",
              "      <td>-0.470676</td>\n",
              "      <td>1</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Specific Uses or Applications of Nanostructure...</td>\n",
              "      <td>Instrument | Medical | Apparatus | Diagnostic ...</td>\n",
              "      <td>Game | Machine | Apparatus | Play | Board</td>\n",
              "      <td>0.608825</td>\n",
              "      <td>-1.692199</td>\n",
              "      <td>1</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Machines, Apparatus or Devices for, or Methods...</td>\n",
              "      <td>Arrangement | Tour | Sightseeing | Travel | Book</td>\n",
              "      <td>Legal | Rights | License | Exploitation | Inte...</td>\n",
              "      <td>0.593654</td>\n",
              "      <td>-1.168645</td>\n",
              "      <td>1</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0e97cea9-92c4-4602-b7c9-8126b4c4babe')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-0e97cea9-92c4-4602-b7c9-8126b4c4babe button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-0e97cea9-92c4-4602-b7c9-8126b4c4babe');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-b29ae3c4-4f3b-4269-adbd-3e982cf37805\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-b29ae3c4-4f3b-4269-adbd-3e982cf37805')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-b29ae3c4-4f3b-4269-adbd-3e982cf37805 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "input_df",
              "summary": "{\n  \"name\": \"input_df\",\n  \"rows\": 995,\n  \"fields\": [\n    {\n      \"column\": \"descriptor\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 496,\n        \"samples\": [\n          \"Making Nets by Knotting of Filamentary Material; Making Knotted Carpets or Tapestries; Knotting not otherwise provided for\",\n          \"Coding; Decoding; Code Conversion in General \",\n          \"Wind Motors\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to1\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 493,\n        \"samples\": [\n          \"Polish | Removable | Wax | Floor | Strip\",\n          \"Resin | Semi | Semiworked | Membrane | Asbestos\",\n          \"Horn | Bassoon | Cellos | Kazoos | Kettledrum\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to2\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 489,\n        \"samples\": [\n          \"Distributed | Transport | Supplied | Water | Gas\",\n          \"Health | Medical | Care | Assess | Information\",\n          \"Metal | Powder | Decor | Artist | Form\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to1\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5.675350993341423,\n        \"min\": -30.0,\n        \"max\": 82.4017538091875,\n        \"num_unique_values\": 989,\n        \"samples\": [\n          0.0734000240848218,\n          0.322685475009825,\n          -2.08384844027708\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to2\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 
6.335371118591971,\n        \"min\": -30.0,\n        \"max\": 87.3568152518451,\n        \"num_unique_values\": 982,\n        \"samples\": [\n          7.94467834472776,\n          3.37675585588414,\n          41.8238163839616\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"p2t\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"chat_gpt\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": null,\n        \"min\": null,\n        \"max\": null,\n        \"num_unique_values\": 0,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 39
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "m4UptqIk_Zfh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Function to generate Result (1 or 0) based on the descriptors\n",
        "def get_result(descriptor, to1, to2):\n",
        "    # Construct the prompt dynamically for each row\n",
        "    prompt = f\"\"\"\n",
        "    I give you a descriptor of a technology: {descriptor}.\n",
        "    Next, I give you the descriptors of two distinct market product keywords (stems): {to1} and {to2}.\n",
        "    If {descriptor} is more related to {to1}, give a response Result: 1; otherwise, give a response Result: 0.\n",
        "\n",
        "    Provide output as Result: 1 or Result: 0 without further token creation.\n",
        "    \"\"\"\n",
        "\n",
        "    while True:\n",
        "        try:\n",
        "            # Make a request GPT-4\n",
        "            response = openai.ChatCompletion.create(\n",
        "                model=\"gpt-4-turbo\",\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
        "                    {\"role\": \"user\", \"content\": prompt}\n",
        "                ],\n",
        "                temperature=0  # Setting temperature to 0 for consistent, deterministic results\n",
        "            )\n",
        "            # Extract the response (Result: 1 or Result: 0)\n",
        "            result = response['choices'][0]['message']['content'].strip()\n",
        "            return result\n",
        "\n",
        "        except RateLimitError as e:\n",
        "            print(f\"Rate limit error encountered: {e}. Retrying after a delay...\")\n",
        "            # Wait before retrying, for example, waiting 20 seconds before retrying\n",
        "            time.sleep(20)\n",
        "\n",
        "# Iterate over each row in the DataFrame and update the chat_gpt column\n",
        "for index, row in input_df.iterrows():\n",
        "    descriptor = row['descriptor']\n",
        "    to1 = row['to1']\n",
        "    to2 = row['to2']\n",
        "\n",
        "    # Get the result for this row\n",
        "    result = get_result(descriptor, to1, to2)\n",
        "\n",
        "    # Assign the result to the 'chat_gpt' column of the DataFrame\n",
        "    input_df.at[index, 'chat_gpt'] = result\n",
        "\n",
        "    if index == 0 or (index % 50 == 0):\n",
        "        current_time = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
        "        print(f\"Iteration: {index+1}, Timestamp: {current_time}\", flush=True)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "m9xFEK0r_tpG",
        "outputId": "52425308-ffdb-4cab-b3ca-19f93e669c83"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Iteration: 1, Timestamp: 2025-01-29 14:36:51\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-40-769b93e615d9>:42: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Result: 1' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
            "  input_df.at[index, 'chat_gpt'] = result\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Iteration: 51, Timestamp: 2025-01-29 14:37:32\n",
            "Iteration: 101, Timestamp: 2025-01-29 14:38:23\n",
            "Iteration: 151, Timestamp: 2025-01-29 14:38:58\n",
            "Iteration: 201, Timestamp: 2025-01-29 14:39:39\n",
            "Iteration: 251, Timestamp: 2025-01-29 14:40:16\n",
            "Iteration: 301, Timestamp: 2025-01-29 14:41:00\n",
            "Iteration: 351, Timestamp: 2025-01-29 14:41:37\n",
            "Iteration: 401, Timestamp: 2025-01-29 14:42:19\n",
            "Iteration: 451, Timestamp: 2025-01-29 14:42:55\n",
            "Iteration: 501, Timestamp: 2025-01-29 14:43:32\n",
            "Iteration: 551, Timestamp: 2025-01-29 14:44:10\n",
            "Iteration: 601, Timestamp: 2025-01-29 14:44:51\n",
            "Iteration: 651, Timestamp: 2025-01-29 14:45:27\n",
            "Iteration: 701, Timestamp: 2025-01-29 14:46:05\n",
            "Iteration: 751, Timestamp: 2025-01-29 14:46:43\n",
            "Iteration: 801, Timestamp: 2025-01-29 14:47:18\n",
            "Iteration: 851, Timestamp: 2025-01-29 14:48:13\n",
            "Iteration: 901, Timestamp: 2025-01-29 14:48:51\n",
            "Iteration: 951, Timestamp: 2025-01-29 14:49:32\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Extract only the from the 'chat_gpt' column\n",
        "input_df['chat_gpt'] = input_df['chat_gpt'].str.extract(r'Result:\\s*(.*)', expand=False)\n",
        "input_df['chat_gpt'] = input_df['chat_gpt'].str.extract(r'(\\d+)', expand=False)\n",
        "input_df['chat_gpt'] = input_df['chat_gpt'].astype(int)"
      ],
      "metadata": {
        "id": "wi17daZDmr1E"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Number of bootstrap samples\n",
        "n_bootstraps = 1000\n",
        "boot_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}\n",
        "\n",
        "# Perform bootstrapping\n",
        "for _ in range(n_bootstraps):\n",
        "    sample_df = resample(input_df, replace=True)\n",
        "\n",
        "    acc = accuracy_score(sample_df['p2t'], sample_df['chat_gpt'])\n",
        "    prec = precision_score(sample_df['p2t'], sample_df['chat_gpt'], zero_division=0)\n",
        "    rec = recall_score(sample_df['p2t'], sample_df['chat_gpt'])\n",
        "    f1 = f1_score(sample_df['p2t'], sample_df['chat_gpt'])\n",
        "\n",
        "    boot_metrics['accuracy'].append(acc)\n",
        "    boot_metrics['precision'].append(prec)\n",
        "    boot_metrics['recall'].append(rec)\n",
        "    boot_metrics['f1'].append(f1)\n",
        "\n",
        "# Compute mean and standard deviation for each metric\n",
        "performance_metrics = pd.DataFrame({\n",
        "    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
        "    'Mean': [\n",
        "        np.mean(boot_metrics['accuracy']),\n",
        "        np.mean(boot_metrics['precision']),\n",
        "        np.mean(boot_metrics['recall']),\n",
        "        np.mean(boot_metrics['f1'])\n",
        "    ],\n",
        "    'Std Dev': [\n",
        "        np.std(boot_metrics['accuracy']),\n",
        "        np.std(boot_metrics['precision']),\n",
        "        np.std(boot_metrics['recall']),\n",
        "        np.std(boot_metrics['f1'])\n",
        "    ]\n",
        "})\n",
        "\n",
        "# Generate confusion matrix\n",
        "conf_matrix = confusion_matrix(input_df['p2t'], input_df['chat_gpt'])\n",
        "\n",
        "# Create a confusion matrix DataFrame\n",
        "confusion_matrix_df = pd.DataFrame(\n",
        "    conf_matrix,\n",
        "    index=['Actual 0', 'Actual 1'],\n",
        "    columns=['Predicted 0', 'Predicted 1']\n",
        ")\n",
        "\n",
        "# Display performance metrics\n",
        "print(\"Performance Metrics:\")\n",
        "print(performance_metrics)\n",
        "\n",
        "# Display confusion matrix\n",
        "print(\"\\nConfusion Matrix:\")\n",
        "print(confusion_matrix_df)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lLUKZqFxkeTE",
        "outputId": "d9e40812-28e3-4b64-8fec-8640ba836552"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Performance Metrics:\n",
            "      Metric      Mean   Std Dev\n",
            "0   Accuracy  0.699853  0.014742\n",
            "1  Precision  0.682726  0.020317\n",
            "2     Recall  0.750175  0.019091\n",
            "3   F1-Score  0.714657  0.015753\n",
            "\n",
            "Confusion Matrix:\n",
            "          Predicted 0  Predicted 1\n",
            "Actual 0          321          174\n",
            "Actual 1          125          375\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "#  Validation 2\n",
        "We select 1,000 technology-Nice subclass pairs with Zij values above the 75th percentile of positive Zij values and supplemented them with additional Nice subclasses linked to the same technologies but having negative Zij values.\n",
        "\n"
      ],
      "metadata": {
        "id": "dvKqYcnKr_jS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Calculate and create a dataframe based on the 75th percentile of positive Zij values.\n",
        "top_nth_percentile = p2t['Zij'].quantile(0.75)\n",
        "top_df = p2t[p2t['Zij'] >= top_nth_percentile]"
      ],
      "metadata": {
        "id": "6AeM2RZfkIK_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Find the common values in the CPC_4digit column between top_df and z_values_negative dataframes\n",
        "common_CPC_4digit = set(top_df['CPC_4digit']).intersection(set(z_values_negative['CPC_4digit']))\n",
        "\n",
        "# Filter both dataframes to keep only the rows with common CPC_4digit\n",
        "filtered_top_df = top_df[top_df['CPC_4digit'].isin(common_CPC_4digit)]"
      ],
      "metadata": {
        "id": "geptnGnllBfA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "input_df1 = pd.DataFrame(columns=['descriptor', 'to1', 'Zij_to1', 'to2', 'Zij_to2', 'p2t'])\n",
        "M = 500\n",
        "\n",
        "for _ in range(M):\n",
        "    loop_row = filtered_top_df.sample(n=1)\n",
        "    loop_row = loop_row[['CPC_4digit_label', 'Nice_subclass_keyword', 'Zij']].rename(\n",
        "        columns={'CPC_4digit_label': 'descriptor', 'Nice_subclass_keyword': 'to1', 'Zij': 'Zij_to1'}\n",
        "    )\n",
        "    matching_rows = z_values_negative[z_values_negative['CPC_4digit_label'] == loop_row['descriptor'].values[0]]\n",
        "\n",
        "    if not matching_rows.empty:\n",
        "        random_negative = matching_rows.sample(n=1)\n",
        "        random_negative = random_negative[['Nice_subclass_keyword', 'Zij']].rename(\n",
        "            columns={'Nice_subclass_keyword': 'to2', 'Zij': 'Zij_to2'}\n",
        "        )\n",
        "\n",
        "        loop_row = pd.concat([loop_row.reset_index(drop=True), random_negative.reset_index(drop=True)], axis=1)\n",
        "        loop_row['p2t'] = (loop_row['Zij_to1'] > loop_row['Zij_to2']).astype(int)\n",
        "        input_df1 = pd.concat([input_df1, loop_row], ignore_index=True)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bTi_WwSxnRDk",
        "outputId": "620dc99e-9ea2-444c-f387-067bd226bc17"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-47-40e2350d4e70>:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
            "  input_df1 = pd.concat([input_df1, loop_row], ignore_index=True)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "input_df2 = pd.DataFrame(columns=['descriptor', 'to1', 'Zij_to1', 'to2', 'Zij_to2', 'p2t'])\n",
        "M = 500\n",
        "\n",
        "for _ in range(M):\n",
        "    loop_row = z_values_negative.sample(n=1)\n",
        "    loop_row = loop_row[['CPC_4digit_label', 'Nice_subclass_keyword', 'Zij']].rename(\n",
        "        columns={'CPC_4digit_label': 'descriptor', 'Nice_subclass_keyword': 'to1', 'Zij': 'Zij_to1'}\n",
        "    )\n",
        "    matching_rows = filtered_top_df[filtered_top_df['CPC_4digit_label'] == loop_row['descriptor'].values[0]]\n",
        "\n",
        "    if not matching_rows.empty:\n",
        "        random_negative= matching_rows.sample(n=1)\n",
        "        random_negative = random_negative[['Nice_subclass_keyword', 'Zij']].rename(\n",
        "            columns={'Nice_subclass_keyword': 'to2', 'Zij': 'Zij_to2'}\n",
        "        )\n",
        "\n",
        "        loop_row = pd.concat([loop_row.reset_index(drop=True), random_negative.reset_index(drop=True)], axis=1)\n",
        "        loop_row['p2t'] = (loop_row['Zij_to1'] > loop_row['Zij_to2']).astype(int)\n",
        "        input_df2 = pd.concat([input_df2, loop_row], ignore_index=True)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6FTfWw_8tpAx",
        "outputId": "870cf0a5-9c47-46b0-f717-16db007cb3bf"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-48-8e8cc012e95f>:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
            "  input_df2 = pd.concat([input_df2, loop_row], ignore_index=True)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "input_df = pd.concat([input_df1, input_df2], ignore_index=True)\n",
        "input_df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "BidHYn4muHrV",
        "outputId": "bf7d6730-60f4-47c4-e338-6c31b6a5880d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                          descriptor  \\\n",
              "0    Walking Sticks; Umbrellas; Ladies' or Like Fans   \n",
              "1  Buildings or Like Structures for Particular Pu...   \n",
              "2                       Household or Table Equipment   \n",
              "3  Railway Systems; Equipment therefor not otherw...   \n",
              "4  Indexing Scheme Relating to Books, Filing Appl...   \n",
              "\n",
              "                                             to1    Zij_to1  \\\n",
              "0            Yarn | Silk | Knit | Natural | Wool   3.826451   \n",
              "1       Reagent | Water | Chemical | Test | Cell   3.419884   \n",
              "2  Armband | Collar | Food | Support | Appliques   4.940108   \n",
              "3     Vehicle | Rental | Tow | Motor | Transport   4.116292   \n",
              "4          Raw | Edible | Grain | Seaweed | Bean  11.989238   \n",
              "\n",
              "                                                 to2   Zij_to2 p2t  \n",
              "0            Food | Syrup | Starch | Flour | Glucose -0.379350   1  \n",
              "1         Rental | Hire | Room | Household | Kitchen -0.615719   1  \n",
              "2  Repellant | Insect | Impregnated | Animal | Ca... -0.963752   1  \n",
              "3                Gel | Cream | Skin | Medical | Care -0.509233   1  \n",
              "4  Feature | Token-operated | Wristwatches | Appa... -0.129941   1  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-d1702af7-30d9-45ee-80cc-a10524e3ccb7\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>descriptor</th>\n",
              "      <th>to1</th>\n",
              "      <th>Zij_to1</th>\n",
              "      <th>to2</th>\n",
              "      <th>Zij_to2</th>\n",
              "      <th>p2t</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Walking Sticks; Umbrellas; Ladies' or Like Fans</td>\n",
              "      <td>Yarn | Silk | Knit | Natural | Wool</td>\n",
              "      <td>3.826451</td>\n",
              "      <td>Food | Syrup | Starch | Flour | Glucose</td>\n",
              "      <td>-0.379350</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Buildings or Like Structures for Particular Pu...</td>\n",
              "      <td>Reagent | Water | Chemical | Test | Cell</td>\n",
              "      <td>3.419884</td>\n",
              "      <td>Rental | Hire | Room | Household | Kitchen</td>\n",
              "      <td>-0.615719</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Household or Table Equipment</td>\n",
              "      <td>Armband | Collar | Food | Support | Appliques</td>\n",
              "      <td>4.940108</td>\n",
              "      <td>Repellant | Insect | Impregnated | Animal | Ca...</td>\n",
              "      <td>-0.963752</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Railway Systems; Equipment therefor not otherw...</td>\n",
              "      <td>Vehicle | Rental | Tow | Motor | Transport</td>\n",
              "      <td>4.116292</td>\n",
              "      <td>Gel | Cream | Skin | Medical | Care</td>\n",
              "      <td>-0.509233</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Indexing Scheme Relating to Books, Filing Appl...</td>\n",
              "      <td>Raw | Edible | Grain | Seaweed | Bean</td>\n",
              "      <td>11.989238</td>\n",
              "      <td>Feature | Token-operated | Wristwatches | Appa...</td>\n",
              "      <td>-0.129941</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d1702af7-30d9-45ee-80cc-a10524e3ccb7')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-d1702af7-30d9-45ee-80cc-a10524e3ccb7 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-d1702af7-30d9-45ee-80cc-a10524e3ccb7');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-cce3a323-545f-43c9-bdb3-7077d59d7c7c\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-cce3a323-545f-43c9-bdb3-7077d59d7c7c')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-cce3a323-545f-43c9-bdb3-7077d59d7c7c button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "input_df",
              "summary": "{\n  \"name\": \"input_df\",\n  \"rows\": 761,\n  \"fields\": [\n    {\n      \"column\": \"descriptor\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 379,\n        \"samples\": [\n          \"Technical Subjects Covered by former Uspc Cross-Reference Art Collections [Xracs] And Digests\",\n          \"Methods or Apparatus for Generating or Transmitting Mechanical Vibrations of infrasonic, Sonic, or Ultrasonic Frequency, {e.g.} for Performing Mechanical Work in General\",\n          \"Soil Working in Agriculture or Forestry; Parts, Details, or Accessories of Agricultural Machines or Implements, in General\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to1\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 426,\n        \"samples\": [\n          \"Racket | Tennis | Bat | Net | Racquet\",\n          \"Machine | Electric | Food | Kitchen | Household\",\n          \"Rental | Hire | Plant | Equipment | Apparatus\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to1\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 13.61619439973768,\n        \"min\": -30.0,\n        \"max\": 234.667967105517,\n        \"num_unique_values\": 751,\n        \"samples\": [\n          -0.0229174624191224,\n          12.5388756307431,\n          6.29516536366137\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to2\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 412,\n        \"samples\": [\n          \"Cigarette | Electron | Atomic | Automata | Case\",\n          \"Stone | Natural | Concrete | Marble | Block\",\n          \"Hand | Drill | Blade | Cut | Implement]\"\n        ],\n        
\"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to2\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 10.383741851055328,\n        \"min\": -30.0,\n        \"max\": 98.8314476294398,\n        \"num_unique_values\": 735,\n        \"samples\": [\n          68.5602101374045,\n          -1.53661449378706,\n          -5.05074869888362\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"p2t\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 51
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Start 'chat_gpt' as an object column: the model replies stored in it are strings,\n",
        "# and writing a string into a float64 (np.nan) column triggers a pandas FutureWarning.\n",
        "input_df['chat_gpt'] = pd.Series(np.nan, index=input_df.index, dtype='object')\n",
        "###\n",
        "# Function to generate Result (1 or 0) based on the descriptors\n",
        "def get_result(descriptor, to1, to2, model=\"gpt-4-turbo\", retry_delay=20, max_retries=None):\n",
        "    \"\"\"Ask the chat model which of two keyword sets a technology descriptor is closer to.\n",
        "\n",
        "    Args:\n",
        "        descriptor: Technology descriptor text inserted into the prompt.\n",
        "        to1: First market product keyword string; the model is asked to answer\n",
        "            'Result: 1' when the descriptor is more related to this one.\n",
        "        to2: Second market product keyword string ('Result: 0' otherwise).\n",
        "        model: Chat model name sent to the API.\n",
        "        retry_delay: Seconds to sleep after a rate-limit error before retrying.\n",
        "        max_retries: Maximum number of retries after rate-limit errors.\n",
        "            None (the default) retries indefinitely.\n",
        "\n",
        "    Returns:\n",
        "        The content of the first returned choice, stripped of surrounding whitespace.\n",
        "\n",
        "    Raises:\n",
        "        RateLimitError: re-raised once max_retries (if given) is exhausted.\n",
        "    \"\"\"\n",
        "    # Construct the prompt dynamically for each row\n",
        "    prompt = f\"\"\"\n",
        "    I give you a descriptor of a technology: {descriptor}.\n",
        "    Next, I give you the descriptors of two distinct market product keywords (stems): {to1} and {to2}.\n",
        "    If {descriptor} is more related to {to1}, give a response Result: 1; otherwise, give a response Result: 0.\n",
        "\n",
        "    Provide output as Result: 1 or Result: 0 without further token creation.\n",
        "    \"\"\"\n",
        "\n",
        "    messages = [\n",
        "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
        "        {\"role\": \"user\", \"content\": prompt}\n",
        "    ]\n",
        "\n",
        "    retries = 0\n",
        "    while True:\n",
        "        try:\n",
        "            response = openai.ChatCompletion.create(\n",
        "                model=model,\n",
        "                messages=messages,\n",
        "                temperature=0  # Setting temperature to 0 for consistent, deterministic results\n",
        "            )\n",
        "            # Extract the response (Result: 1 or Result: 0)\n",
        "            return response['choices'][0]['message']['content'].strip()\n",
        "\n",
        "        except RateLimitError as e:\n",
        "            retries += 1\n",
        "            # Give up only when the caller asked for a bounded number of retries\n",
        "            if max_retries is not None and retries > max_retries:\n",
        "                raise\n",
        "            print(f\"Rate limit error encountered: {e}. Retrying after a delay...\")\n",
        "            time.sleep(retry_delay)\n",
        "\n",
        "# Query the model once per row and store its raw reply in the 'chat_gpt' column\n",
        "for index, row in input_df.iterrows():\n",
        "    descriptor, to1, to2 = row['descriptor'], row['to1'], row['to2']\n",
        "\n",
        "    result = get_result(descriptor, to1, to2)\n",
        "    input_df.at[index, 'chat_gpt'] = result\n",
        "\n",
        "    # Progress line only for index labels divisible by 50 (label 0 included, since 0 % 50 == 0)\n",
        "    if index % 50 != 0:\n",
        "        continue\n",
        "    current_time = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
        "    print(f\"Iteration: {index+1}, Timestamp: {current_time}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GZdEUEb4syRh",
        "outputId": "545ebc30-3336-453c-e710-1f80cfce2d3d"
      },
      "execution_count": null,
      "outputs": [
        {
          "metadata": {
            "tags": null
          },
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "<ipython-input-52-9038ca8b1d50>:44: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Result: 1' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
            "  input_df.at[index, 'chat_gpt'] = result\n"
          ]
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Iteration: 1, Timestamp: 2025-01-29 15:06:19\n",
            "Iteration: 51, Timestamp: 2025-01-29 15:06:54\n",
            "Iteration: 101, Timestamp: 2025-01-29 15:07:31\n",
            "Iteration: 151, Timestamp: 2025-01-29 15:08:14\n",
            "Iteration: 201, Timestamp: 2025-01-29 15:09:04\n",
            "Iteration: 251, Timestamp: 2025-01-29 15:09:45\n",
            "Iteration: 301, Timestamp: 2025-01-29 15:10:21\n",
            "Iteration: 351, Timestamp: 2025-01-29 15:11:02\n",
            "Iteration: 401, Timestamp: 2025-01-29 15:11:40\n",
            "Iteration: 451, Timestamp: 2025-01-29 15:12:24\n",
            "Iteration: 501, Timestamp: 2025-01-29 15:12:59\n",
            "Iteration: 551, Timestamp: 2025-01-29 15:13:44\n",
            "Iteration: 601, Timestamp: 2025-01-29 15:14:22\n",
            "Iteration: 651, Timestamp: 2025-01-29 15:14:55\n",
            "Iteration: 701, Timestamp: 2025-01-29 15:15:38\n",
            "Iteration: 751, Timestamp: 2025-01-29 15:16:20\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Extract only the digits from the 'chat_gpt' column:\n",
        "# first keep whatever follows 'Result:', then take the first run of digits in it\n",
        "input_df['chat_gpt'] = (\n",
        "    input_df['chat_gpt']\n",
        "    .str.extract(r'Result:\\s*(.*)', expand=False)\n",
        "    .str.extract(r'(\\d+)', expand=False)\n",
        ")"
      ],
      "metadata": {
        "id": "UrpYKIPktPl3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop rows without a parsed answer or label *before* casting: astype(int) raises on NaN,\n",
        "# so casting first would fail on any reply that yielded no digit.\n",
        "input_df = (\n",
        "    input_df\n",
        "    .dropna(subset=['p2t', 'chat_gpt'])\n",
        "    .astype({'chat_gpt': int, 'p2t': int})\n",
        ")"
      ],
      "metadata": {
        "id": "S9oGnMvstPVv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Number of bootstrap samples\n",
        "n_bootstraps = 1000\n",
        "# Seeded generator so the bootstrap estimates are reproducible from run to run\n",
        "bootstrap_rng = np.random.RandomState(42)\n",
        "boot_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}\n",
        "\n",
        "# Scorers called as fn(y_true, y_pred). zero_division=0 is applied to precision, recall and F1\n",
        "# alike so a degenerate resample is scored as 0 consistently across the three metrics.\n",
        "metric_fns = {\n",
        "    'accuracy': accuracy_score,\n",
        "    'precision': lambda y_true, y_pred: precision_score(y_true, y_pred, zero_division=0),\n",
        "    'recall': lambda y_true, y_pred: recall_score(y_true, y_pred, zero_division=0),\n",
        "    'f1': lambda y_true, y_pred: f1_score(y_true, y_pred, zero_division=0),\n",
        "}\n",
        "\n",
        "# Perform bootstrapping: resample rows with replacement and score every replicate\n",
        "for _ in range(n_bootstraps):\n",
        "    sample_df = resample(input_df, replace=True, random_state=bootstrap_rng)\n",
        "    for metric_name, metric_fn in metric_fns.items():\n",
        "        boot_metrics[metric_name].append(metric_fn(sample_df['p2t'], sample_df['chat_gpt']))\n",
        "\n",
        "# Compute mean and standard deviation for each metric\n",
        "metric_order = ['accuracy', 'precision', 'recall', 'f1']\n",
        "performance_metrics = pd.DataFrame({\n",
        "    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
        "    'Mean': [np.mean(boot_metrics[key]) for key in metric_order],\n",
        "    'Std Dev': [np.std(boot_metrics[key]) for key in metric_order]\n",
        "})\n",
        "\n",
        "# Generate confusion matrix on the full (non-resampled) data.\n",
        "# labels=[0, 1] pins a 2x2 result even if one class is absent, matching the labels used below.\n",
        "conf_matrix = confusion_matrix(input_df['p2t'], input_df['chat_gpt'], labels=[0, 1])\n",
        "\n",
        "# Create a confusion matrix DataFrame (rows: true label, columns: predicted label)\n",
        "confusion_matrix_df = pd.DataFrame(\n",
        "    conf_matrix,\n",
        "    index=['Actual 0', 'Actual 1'],\n",
        "    columns=['Predicted 0', 'Predicted 1']\n",
        ")\n",
        "\n",
        "# Display performance metrics\n",
        "print(\"Performance Metrics:\")\n",
        "print(performance_metrics)\n",
        "\n",
        "# Display confusion matrix\n",
        "print(\"\\nConfusion Matrix:\")\n",
        "print(confusion_matrix_df)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qFMR7ODhtPLl",
        "outputId": "1f797612-0d2a-40fa-cc84-e09aea909d4d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Performance Metrics:\n",
            "      Metric      Mean   Std Dev\n",
            "0   Accuracy  0.791541  0.014471\n",
            "1  Precision  0.760278  0.020666\n",
            "2     Recall  0.846990  0.018692\n",
            "3   F1-Score  0.801098  0.015513\n",
            "\n",
            "Confusion Matrix:\n",
            "          Predicted 0  Predicted 1\n",
            "Actual 0          283          101\n",
            "Actual 1           58          319\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "#  Validation 3\n",
        "Validation 3 tests the strength of positive relationships. We begin with all pairs whose Zij values are greater than zero. From this set, we randomly selected 1,000 technology-Nice subclass pairs, drawn from those with Zij values above the 90th percentile (indicating strong positive relationships) and those below the 10th percentile (representing weak positive relationships)."
      ],
      "metadata": {
        "id": "YL1yI_Sb4eVo"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Split p2t by the decile cut-offs of Zij: rows at or above the 90th percentile go to top_df,\n",
        "# rows at or below the 10th percentile go to bottom_df.\n",
        "# NOTE(review): presumably p2t already holds only positive Zij values - confirm upstream.\n",
        "top_nth_percentile = p2t['Zij'].quantile(q=0.9)\n",
        "bottom_nth_percentile = p2t['Zij'].quantile(q=0.1)\n",
        "top_df = p2t.loc[p2t['Zij'].ge(top_nth_percentile)]\n",
        "bottom_df = p2t.loc[p2t['Zij'].le(bottom_nth_percentile)]"
      ],
      "metadata": {
        "id": "pWMd0vC55spS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Same five summary statistics of Zij for the full table and for each percentile subset\n",
        "zij_summary = ['min', 'max', 'mean', 'median', 'std']\n",
        "stats_p2t, stats_top_df, stats_bottom_df = (\n",
        "    frame['Zij'].agg(zij_summary) for frame in (p2t, top_df, bottom_df)\n",
        ")\n",
        "\n",
        "# One column per subset, one row per statistic\n",
        "all_stats = pd.DataFrame(\n",
        "    {'p2t': stats_p2t, 'Top n% Zij': stats_top_df, 'Bottom n% Zij': stats_bottom_df}\n",
        ")\n",
        "\n",
        "print(all_stats)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ch-avbV2riw8",
        "outputId": "4a9c86ff-7c92-4784-8456-c9e17f4e5b58"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "               p2t  Top n% Zij  Bottom n% Zij\n",
            "min       0.000015    7.213342       0.000015\n",
            "max     234.667967  234.667967       0.207329\n",
            "mean      3.259779   17.387941       0.102301\n",
            "median    1.365613   12.347799       0.101614\n",
            "std       6.991583   15.627399       0.060118\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# CPC_4digit codes that occur in both the top and the bottom percentile frames\n",
        "common_CPC_4digit = set(top_df['CPC_4digit']) & set(bottom_df['CPC_4digit'])\n",
        "\n",
        "# Restrict each frame to the shared codes, so every remaining code has rows in both frames\n",
        "filtered_top_df = top_df.loc[top_df['CPC_4digit'].isin(common_CPC_4digit)]\n",
        "filtered_bottom_df = bottom_df.loc[bottom_df['CPC_4digit'].isin(common_CPC_4digit)]"
      ],
      "metadata": {
        "id": "QOV8GTFu6VNJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Build comparison rows in which `to1` is the strong (top-percentile) keyword and `to2` the weak one.\n",
        "# Rows are collected in a list and the DataFrame is built once at the end: growing a frame with\n",
        "# pd.concat inside the loop is quadratic and triggers the empty-frame concat FutureWarning.\n",
        "M = 500\n",
        "pair_columns = ['descriptor', 'to1', 'Zij_to1', 'to2', 'Zij_to2', 'p2t']\n",
        "pair_records = []\n",
        "\n",
        "for _ in range(M):\n",
        "    # One random strong pair supplies the descriptor and the first option\n",
        "    top_row = filtered_top_df.sample(n=1).iloc[0]\n",
        "    # Weak pairs that share the same technology descriptor\n",
        "    matching_rows = filtered_bottom_df[filtered_bottom_df['CPC_4digit_label'] == top_row['CPC_4digit_label']]\n",
        "\n",
        "    # Draws without a weak counterpart are skipped, so fewer than M rows may result\n",
        "    if matching_rows.empty:\n",
        "        continue\n",
        "\n",
        "    bottom_row = matching_rows.sample(n=1).iloc[0]\n",
        "    zij_to1, zij_to2 = float(top_row['Zij']), float(bottom_row['Zij'])\n",
        "    pair_records.append({\n",
        "        'descriptor': top_row['CPC_4digit_label'],\n",
        "        'to1': top_row['Nice_subclass_keyword'],\n",
        "        'Zij_to1': zij_to1,\n",
        "        'to2': bottom_row['Nice_subclass_keyword'],\n",
        "        'Zij_to2': zij_to2,\n",
        "        # Ground-truth label: 1 when the first option has the larger Zij\n",
        "        'p2t': int(zij_to1 > zij_to2),\n",
        "    })\n",
        "\n",
        "input_df1 = pd.DataFrame(pair_records, columns=pair_columns)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rFMtpgbc6YhP",
        "outputId": "d4faaf56-dfb3-4498-9f5c-a34689639b17"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-65-7698a3f7b990>:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
            "  input_df1 = pd.concat([input_df1, loop_row], ignore_index=True)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Build comparison rows in which `to1` is the weak (bottom-percentile) keyword and `to2` the strong one.\n",
        "# Rows are collected in a list and the DataFrame is built once at the end: growing a frame with\n",
        "# pd.concat inside the loop is quadratic and triggers the empty-frame concat FutureWarning.\n",
        "M = 500\n",
        "pair_columns = ['descriptor', 'to1', 'Zij_to1', 'to2', 'Zij_to2', 'p2t']\n",
        "pair_records = []\n",
        "\n",
        "for _ in range(M):\n",
        "    # One random weak pair supplies the descriptor and the first option\n",
        "    bottom_row = filtered_bottom_df.sample(n=1).iloc[0]\n",
        "    # Strong pairs that share the same technology descriptor\n",
        "    matching_rows = filtered_top_df[filtered_top_df['CPC_4digit_label'] == bottom_row['CPC_4digit_label']]\n",
        "\n",
        "    # Draws without a strong counterpart are skipped, so fewer than M rows may result\n",
        "    if matching_rows.empty:\n",
        "        continue\n",
        "\n",
        "    top_row = matching_rows.sample(n=1).iloc[0]\n",
        "    zij_to1, zij_to2 = float(bottom_row['Zij']), float(top_row['Zij'])\n",
        "    pair_records.append({\n",
        "        'descriptor': bottom_row['CPC_4digit_label'],\n",
        "        'to1': bottom_row['Nice_subclass_keyword'],\n",
        "        'Zij_to1': zij_to1,\n",
        "        'to2': top_row['Nice_subclass_keyword'],\n",
        "        'Zij_to2': zij_to2,\n",
        "        # Ground-truth label: 1 when the first option has the larger Zij\n",
        "        'p2t': int(zij_to1 > zij_to2),\n",
        "    })\n",
        "\n",
        "input_df2 = pd.DataFrame(pair_records, columns=pair_columns)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pdEWRV2a7Cvu",
        "outputId": "db2ca00f-8b57-4b85-a42e-a27b1e1380e4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-66-edb61db076f4>:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
            "  input_df2 = pd.concat([input_df2, loop_row], ignore_index=True)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Stack both sets of comparison rows into one frame with a fresh 0..n-1 index, then preview it\n",
        "input_df = pd.concat((input_df1, input_df2), axis=0, ignore_index=True)\n",
        "input_df.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "4YN6zLKO7dXU",
        "outputId": "398a8a77-fe89-4ffe-9827-776cfe9dc7a5"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                          descriptor  \\\n",
              "0  Specific Use of Cosmetics or Similar toilet Pr...   \n",
              "1  Spectacles; Sunglasses or Goggles insofar As t...   \n",
              "2  Vehicles, Vehicle Fittings, or Vehicle Parts, ...   \n",
              "3                      Planting; Sowing; Fertilising   \n",
              "4  Treatment of inorganic Materials, Other Than F...   \n",
              "\n",
              "                                                 to1    Zij_to1  \\\n",
              "0            Hair | Style | Color | Lotion | Protect  47.898220   \n",
              "1           Breeder | Massage | Fit | Farm | Eyebrow  15.957368   \n",
              "2  Part | Structural | Vehicle | Motorcycle | Bic...  11.665497   \n",
              "3          Seed | Tree | Propagation | Grass | Media  30.588004   \n",
              "4         Natural | Resin | Paint | Surface | Filled   7.442371   \n",
              "\n",
              "                                                 to2   Zij_to2 p2t  \n",
              "0       Lubricant | Chain | Agent | Cable | Conveyor  0.109723   1  \n",
              "1          Shower | Bath | Fit | Toilet | Sanitarium  0.070216   1  \n",
              "2  Disposable | Adoption | Arbitration | Babies |...  0.129122   1  \n",
              "3  Installation | Apparatus | Part | Water | Sani...  0.073924   1  \n",
              "4  Chemical | Composite | Water | Compound | Manu...  0.109701   1  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-e7c23024-386b-4421-adcd-adc286e70c60\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>descriptor</th>\n",
              "      <th>to1</th>\n",
              "      <th>Zij_to1</th>\n",
              "      <th>to2</th>\n",
              "      <th>Zij_to2</th>\n",
              "      <th>p2t</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Specific Use of Cosmetics or Similar toilet Pr...</td>\n",
              "      <td>Hair | Style | Color | Lotion | Protect</td>\n",
              "      <td>47.898220</td>\n",
              "      <td>Lubricant | Chain | Agent | Cable | Conveyor</td>\n",
              "      <td>0.109723</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Spectacles; Sunglasses or Goggles insofar As t...</td>\n",
              "      <td>Breeder | Massage | Fit | Farm | Eyebrow</td>\n",
              "      <td>15.957368</td>\n",
              "      <td>Shower | Bath | Fit | Toilet | Sanitarium</td>\n",
              "      <td>0.070216</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Vehicles, Vehicle Fittings, or Vehicle Parts, ...</td>\n",
              "      <td>Part | Structural | Vehicle | Motorcycle | Bic...</td>\n",
              "      <td>11.665497</td>\n",
              "      <td>Disposable | Adoption | Arbitration | Babies |...</td>\n",
              "      <td>0.129122</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Planting; Sowing; Fertilising</td>\n",
              "      <td>Seed | Tree | Propagation | Grass | Media</td>\n",
              "      <td>30.588004</td>\n",
              "      <td>Installation | Apparatus | Part | Water | Sani...</td>\n",
              "      <td>0.073924</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Treatment of inorganic Materials, Other Than F...</td>\n",
              "      <td>Natural | Resin | Paint | Surface | Filled</td>\n",
              "      <td>7.442371</td>\n",
              "      <td>Chemical | Composite | Water | Compound | Manu...</td>\n",
              "      <td>0.109701</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e7c23024-386b-4421-adcd-adc286e70c60')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-e7c23024-386b-4421-adcd-adc286e70c60 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-e7c23024-386b-4421-adcd-adc286e70c60');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-d228d3b2-6416-4743-bc7e-349249e5400d\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-d228d3b2-6416-4743-bc7e-349249e5400d')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-d228d3b2-6416-4743-bc7e-349249e5400d button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "input_df",
              "summary": "{\n  \"name\": \"input_df\",\n  \"rows\": 1000,\n  \"fields\": [\n    {\n      \"column\": \"descriptor\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 418,\n        \"samples\": [\n          \"Ladders\",\n          \"General Methods of organic Chemistry; Apparatus therefor \",\n          \"Functional Features or Details Common to Both Smallarms And\\r\\r\\nordnance, e.g. Cannons; Mountings for Smallarms or ordnance\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to1\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 460,\n        \"samples\": [\n          \"Perfume | Fragrance | Scent | Water | Aromatherapeutical\",\n          \"Paint | Decor | Metal | Natural | Acrylamide\",\n          \"Drink | Soft | Flavor | Water | Carbon\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to1\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 13.966017288337989,\n        \"min\": 0.0002704121620077,\n        \"max\": 153.449638133642,\n        \"num_unique_values\": 933,\n        \"samples\": [\n          0.190614341682804,\n          8.65965558274397,\n          0.110673728615979\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"to2\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 439,\n        \"samples\": [\n          \"Sport | Article | Exercise | Apparatus | Gymnastic\",\n          \"Generates | Electric | Energisers | Power | Plant\",\n          \"Gravel | Ash | Ballast | Fair | Hutches\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Zij_to2\",\n      \"properties\": {\n        \"dtype\": 
\"number\",\n        \"std\": 12.042626872061618,\n        \"min\": 0.0007728791271564,\n        \"max\": 129.28410248575,\n        \"num_unique_values\": 867,\n        \"samples\": [\n          0.167568609810533,\n          0.132178494794618,\n          0.155920690277493\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"p2t\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 67
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Pre-create the reply column with object dtype: it is filled with raw text replies\n",
        "# (e.g. 'Result: 1'), and writing strings into a float64 column triggers pandas'\n",
        "# incompatible-dtype FutureWarning. Missing entries are still NaN.\n",
        "input_df['chat_gpt'] = np.full(len(input_df), np.nan, dtype=object)\n",
        "# Function to generate Result (1 or 0) based on the descriptors\n",
        "def get_result(descriptor, to1, to2, max_retries=None, retry_delay=20):\n",
        "    \"\"\"Ask the chat model which keyword set a technology descriptor is closer to.\n",
        "\n",
        "    Args:\n",
        "        descriptor: Technology descriptor text interpolated into the prompt.\n",
        "        to1: First set of market product keywords (stems).\n",
        "        to2: Second set of market product keywords (stems).\n",
        "        max_retries: Maximum number of retries after a rate-limit error.\n",
        "            None (the default) retries indefinitely.\n",
        "        retry_delay: Seconds to sleep before each retry (default 20).\n",
        "\n",
        "    Returns:\n",
        "        The model's reply as a whitespace-stripped string. The prompt requests\n",
        "        'Result: 1' or 'Result: 0', but the reply is returned unvalidated.\n",
        "\n",
        "    Raises:\n",
        "        RateLimitError: Re-raised once max_retries retries are exhausted.\n",
        "            Any other exception from the API call propagates immediately.\n",
        "    \"\"\"\n",
        "    # Construct the prompt dynamically for each row\n",
        "    prompt = f\"\"\"\n",
        "    I give you a descriptor of a technology: {descriptor}.\n",
        "    Next, I give you the descriptors of two distinct market product keywords (stems): {to1} and {to2}.\n",
        "    If {descriptor} is more related to {to1}, give a response Result: 1; otherwise, give a response Result: 0.\n",
        "\n",
        "    Provide output as Result: 1 or Result: 0 without further token creation.\n",
        "    \"\"\"\n",
        "\n",
        "    retries_used = 0\n",
        "    while True:\n",
        "        try:\n",
        "            # Make a request to GPT-4\n",
        "            response = openai.ChatCompletion.create(\n",
        "                model=\"gpt-4-turbo\",\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
        "                    {\"role\": \"user\", \"content\": prompt}\n",
        "                ],\n",
        "                temperature=0  # Setting temperature to 0 for consistent, deterministic results\n",
        "            )\n",
        "            # Extract the response text (expected: Result: 1 or Result: 0)\n",
        "            return response['choices'][0]['message']['content'].strip()\n",
        "\n",
        "        except RateLimitError as e:\n",
        "            # Give up only when the caller set a retry budget and it is spent\n",
        "            if max_retries is not None and retries_used >= max_retries:\n",
        "                raise\n",
        "            retries_used += 1\n",
        "            print(f\"Rate limit error encountered: {e}. Retrying after a delay...\")\n",
        "            # Wait before retrying so the rate-limit window can reset\n",
        "            time.sleep(retry_delay)\n",
        "\n",
        "# Query the model once per row and store each raw reply in the 'chat_gpt' column\n",
        "for index, record in input_df.iterrows():\n",
        "    # Raw reply text for this row's descriptor / keyword pair\n",
        "    input_df.at[index, 'chat_gpt'] = get_result(\n",
        "        record['descriptor'], record['to1'], record['to2']\n",
        "    )\n",
        "\n",
        "    # Progress line on every 50th index label (label 0 included, since 0 % 50 == 0)\n",
        "    if index % 50 == 0:\n",
        "        current_time = time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
        "        print(f\"Iteration: {index+1}, Timestamp: {current_time}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "diu_zF_E6ZG_",
        "outputId": "f2e4c562-9865-4d22-de94-3b462efbcacb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-68-6320c2cc493b>:44: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Result: 1' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
            "  input_df.at[index, 'chat_gpt'] = result\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Iteration: 1, Timestamp: 2025-01-29 15:30:39\n",
            "Iteration: 51, Timestamp: 2025-01-29 15:31:18\n",
            "Iteration: 101, Timestamp: 2025-01-29 15:31:53\n",
            "Iteration: 151, Timestamp: 2025-01-29 15:32:34\n",
            "Iteration: 201, Timestamp: 2025-01-29 15:33:10\n",
            "Iteration: 251, Timestamp: 2025-01-29 15:33:46\n",
            "Iteration: 301, Timestamp: 2025-01-29 15:34:23\n",
            "Iteration: 351, Timestamp: 2025-01-29 15:35:13\n",
            "Iteration: 401, Timestamp: 2025-01-29 15:35:50\n",
            "Iteration: 451, Timestamp: 2025-01-29 15:36:27\n",
            "Iteration: 501, Timestamp: 2025-01-29 15:37:09\n",
            "Iteration: 551, Timestamp: 2025-01-29 15:37:53\n",
            "Iteration: 601, Timestamp: 2025-01-29 15:38:28\n",
            "Iteration: 651, Timestamp: 2025-01-29 15:39:06\n",
            "Iteration: 701, Timestamp: 2025-01-29 15:39:40\n",
            "Iteration: 751, Timestamp: 2025-01-29 15:40:20\n",
            "Iteration: 801, Timestamp: 2025-01-29 15:40:55\n",
            "Iteration: 851, Timestamp: 2025-01-29 15:41:32\n",
            "Iteration: 901, Timestamp: 2025-01-29 15:42:08\n",
            "Iteration: 951, Timestamp: 2025-01-29 15:42:47\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Reduce each raw reply to its numeric label in two steps: take the text that\n",
        "# follows 'Result:', then keep the first run of digits found in that text\n",
        "after_label = input_df['chat_gpt'].str.extract(r'Result:\\s*(.*)', expand=False)\n",
        "input_df['chat_gpt'] = after_label.str.extract(r'(\\d+)', expand=False)"
      ],
      "metadata": {
        "id": "Wr0ppXLM8MYV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop rows lacking a ground-truth label or a parsed model answer BEFORE casting:\n",
        "# astype(int) raises on missing values, so the filter has to run first.\n",
        "input_df = (\n",
        "    input_df\n",
        "    .dropna(subset=['p2t', 'chat_gpt'])\n",
        "    .astype({'chat_gpt': int, 'p2t': int})\n",
        ")"
      ],
      "metadata": {
        "id": "rFae-ip68Q7F"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Number of bootstrap samples\n",
        "n_bootstraps = 1000\n",
        "boot_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}\n",
        "\n",
        "# One seeded generator shared by all iterations: each resample is different,\n",
        "# but the whole bootstrap is reproducible from run to run.\n",
        "# NOTE(review): assumes `resample` is sklearn.utils.resample (accepts random_state) - confirm import.\n",
        "bootstrap_rng = np.random.RandomState(42)\n",
        "\n",
        "# Perform bootstrapping: resample rows with replacement and score each sample\n",
        "for _ in range(n_bootstraps):\n",
        "    sample_df = resample(input_df, replace=True, random_state=bootstrap_rng)\n",
        "    y_true = sample_df['p2t']\n",
        "    y_pred = sample_df['chat_gpt']\n",
        "\n",
        "    # zero_division=0 on every ratio metric so a degenerate sample scores 0 without warnings\n",
        "    boot_metrics['accuracy'].append(accuracy_score(y_true, y_pred))\n",
        "    boot_metrics['precision'].append(precision_score(y_true, y_pred, zero_division=0))\n",
        "    boot_metrics['recall'].append(recall_score(y_true, y_pred, zero_division=0))\n",
        "    boot_metrics['f1'].append(f1_score(y_true, y_pred, zero_division=0))\n",
        "\n",
        "# Compute mean and standard deviation for each metric across the bootstrap samples\n",
        "metric_keys = ['accuracy', 'precision', 'recall', 'f1']\n",
        "performance_metrics = pd.DataFrame({\n",
        "    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
        "    'Mean': [np.mean(boot_metrics[key]) for key in metric_keys],\n",
        "    'Std Dev': [np.std(boot_metrics[key]) for key in metric_keys]\n",
        "})\n",
        "\n",
        "# Generate confusion matrix on the full (non-resampled) data; labels=[0, 1] pins the\n",
        "# 2x2 layout even if one class never occurs, so the row/column names below always fit\n",
        "conf_matrix = confusion_matrix(input_df['p2t'], input_df['chat_gpt'], labels=[0, 1])\n",
        "\n",
        "# Create a confusion matrix DataFrame\n",
        "confusion_matrix_df = pd.DataFrame(\n",
        "    conf_matrix,\n",
        "    index=['Actual 0', 'Actual 1'],\n",
        "    columns=['Predicted 0', 'Predicted 1']\n",
        ")\n",
        "\n",
        "# Display performance metrics\n",
        "print(\"Performance Metrics:\")\n",
        "print(performance_metrics)\n",
        "\n",
        "# Display confusion matrix\n",
        "print(\"\\nConfusion Matrix:\")\n",
        "print(confusion_matrix_df)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "guRIsudH6l5H",
        "outputId": "dd7c909d-8215-43eb-cb10-b42982e13349"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Performance Metrics:\n",
            "      Metric      Mean   Std Dev\n",
            "0   Accuracy  0.763849  0.013179\n",
            "1  Precision  0.705437  0.017932\n",
            "2     Recall  0.905755  0.012824\n",
            "3   F1-Score  0.792992  0.012805\n",
            "\n",
            "Confusion Matrix:\n",
            "          Predicted 0  Predicted 1\n",
            "Actual 0          311          189\n",
            "Actual 1           47          453\n"
          ]
        }
      ]
    }
  ]
}